packages.used=c("rvest", "tibble", "qdap",
"sentimentr", "gplots", "dplyr",
"tm", "syuzhet", "factoextra",
"beeswarm", "scales", "RColorBrewer",
"RANN", "tm", "topicmodels")
# check packages that need to be installed.
packages.needed=setdiff(packages.used,
intersect(installed.packages()[,1],
packages.used))
#print(packages.needed)
# install additional packages
if(length(packages.needed)>0){
install.packages(packages.needed) #, dependencies = TRUE)
}
# load packages
library("rvest")
library("tibble")
library("qdap")
library("sentimentr")
library("gplots")
library("dplyr")
library("tm")
library("syuzhet")
library("factoextra")
library("beeswarm")
library("scales")
library("RColorBrewer")
library("RANN")
library("tm")
library("topicmodels")
source("../lib/plotstacked.R")
source("../lib/speechFuncs.R")
This notebook was prepared with the following environmental settings.
print(R.version)
_
platform x86_64-apple-darwin13.4.0
arch x86_64
os darwin13.4.0
system x86_64, darwin13.4.0
status
major 3
minor 3.1
year 2016
month 06
day 21
svn rev 70800
language R
version.string R version 3.3.1 (2016-06-21)
nickname Bug in Your Hair
Following the example of Jerid Francom, we used Selectorgadget to choose the links we would like to scrap. For this project, we selected all inaugural addresses of past presidents, nomination speeches of major party candidates and farewell addresses. We also included several public speeches from Donald Trump for our textual analysis of presidential speeches.
### Inauguaral speeches
main.page <- read_html(x = "http://www.presidency.ucsb.edu/inaugurals.php")
# Get link URLs
# f.speechlinks is a function for extracting links from the list of speeches.
inaug=f.speechlinks(main.page)
#head(inaug)
as.Date(inaug[,1], format="%B %e, %Y")
[1] "1789-04-30" "1793-03-04" "1797-03-04" "1801-03-04" "1805-03-04"
[6] "1809-03-04" "1813-03-04" "1817-03-04" "1821-03-04" "1825-03-04"
[11] "1829-03-04" "1833-03-04" "1837-03-04" "1841-03-04" "1845-03-04"
[16] "1849-03-05" "1853-03-04" "1857-03-04" "1861-03-04" "1865-03-04"
[21] "1869-03-04" "1873-03-04" "1877-03-05" "1881-03-04" "1885-03-04"
[26] "1889-03-04" "1893-03-04" "1897-03-04" "1901-03-04" "1905-03-04"
[31] "1909-03-04" "1913-03-04" "1917-03-04" "1921-03-04" "1925-03-04"
[36] "1929-03-04" "1933-03-04" "1937-01-20" "1941-01-20" "1945-01-20"
[41] "1949-01-20" "1953-01-20" "1957-01-21" "1961-01-20" "1965-01-20"
[46] "1969-01-20" "1973-01-20" "1977-01-20" "1981-01-20" "1985-01-21"
[51] "1989-01-20" "1993-01-20" "1997-01-20" "2001-01-20" "2005-01-20"
[56] "2009-01-20" "2013-01-21" "2017-01-20" NA
inaug=inaug[-nrow(inaug),] # remove the last line, irrelevant due to error.
#### Nomination speeches
main.page=read_html("http://www.presidency.ucsb.edu/nomination.php")
# Get link URLs
nomin <- f.speechlinks(main.page)
#head(nomin)
#
#### Farewell speeches
main.page=read_html("http://www.presidency.ucsb.edu/farewell_addresses.php")
# Get link URLs
farewell <- f.speechlinks(main.page)
#head(farewell)
inaug.list=read.csv("../data/inauglist.csv", stringsAsFactors = FALSE)
nomin.list=read.csv("../data/nominlist.csv", stringsAsFactors = FALSE)
farewell.list=read.csv("../data/farewelllist.csv", stringsAsFactors = FALSE)
We assemble all scrapped speeches into one list. Note here that we don’t have the full text yet, only the links to full text transcripts.
speech.list=rbind(inaug.list, nomin.list, farewell.list)
speech.list$type=c(rep("inaug", nrow(inaug.list)),
rep("nomin", nrow(nomin.list)),
rep("farewell", nrow(farewell.list)))
speech.url=rbind(inaug, nomin, farewell)
speech.list=cbind(speech.list, speech.url)
Based on the list of speeches, we scrap the main text part of the transcript’s html page. For simple html pages of this kind, Selectorgadget is very convenient for identifying the html node that rvest can use to scrap its content. For reproducibility, we also save our scrapped speeches into our local folder as individual speech files.
# Loop over each row in speech.list
speech.list$fulltext=NA
for(i in seq(nrow(speech.list))) {
text <- read_html(speech.list$urls[i]) %>% # load the page
html_nodes(".displaytext") %>% # isloate the text
html_text() # get the text
speech.list$fulltext[i]=text
# Create the file name
filename <- paste0("../data/fulltext/",
speech.list$type[i],
speech.list$File[i], "-",
speech.list$Term[i], ".txt")
sink(file = filename) %>% # open file to write
cat(text) # write the file
sink() # close the file
}
View(speech.list)
Trump, as president-elect that has not been a politician, do not have a lot of formal speeches yet. For our textual analysis, we manually add several public transcripts from Trump: + [Transcript: Donald Trump’s full immigration speech, annotated. LA Times, 08/31/2016] (http://www.latimes.com/politics/la-na-pol-donald-trump-immigration-speech-transcript-20160831-snap-htmlstory.html) + Transcript of Donald Trump???s speech on national security in Philadelphia - The Hill, 09/07/16 + Transcript of President-elect Trump’s news conference CNBC, 01/11/2017
speech1=paste(readLines("../data/fulltext/SpeechDonaldTrump-NA.txt",
n=-1, skipNul=TRUE),
collapse=" ")
speech2=paste(readLines("../data/fulltext/SpeechDonaldTrump-NA2.txt",
n=-1, skipNul=TRUE),
collapse=" ")
speech3=paste(readLines("../data/fulltext/PressDonaldTrump-NA.txt",
n=-1, skipNul=TRUE),
collapse=" ")
Trump.speeches=data.frame(
X...President=rep("Donald J. Trump", 3),
File=rep("DonaldJTrump", 3),
Term=rep(0, 3),
Party=rep("Republican", 3),
Date=c("August 31, 2016", "September 7, 2016", "January 11, 2017"),
Words=c(word_count(speech1), word_count(speech2), word_count(speech3)),
Win=rep("yes", 3),
type=rep("speeches", 3),
links=rep(NA, 3),
urls=rep(NA, 3),
fulltext=c(speech1, speech2, speech3)
)
speech.list=rbind(speech.list, Trump.speeches)
We will use sentences as units of analysis for this project, as sentences are natural languge units for organizing thoughts and ideas. For each extracted sentence, we apply sentiment analysis using NRC sentiment lexion. “The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The annotations were manually done by crowdsourcing.”
We assign an sequential id to each sentence in a speech (sent.id) and also calculated the number of words in each sentence as sentence length (word.count).
sentence.list=NULL
for(i in 1:nrow(speech.list)){
sentences=sent_detect(speech.list$fulltext[i],
endmarks = c("?", ".", "!", "|",";"))
if(length(sentences)>0){
emotions=get_nrc_sentiment(sentences)
word.count=word_count(sentences)
# colnames(emotions)=paste0("emo.", colnames(emotions))
# in case the word counts are zeros?
emotions=diag(1/(word.count+0.01))%*%as.matrix(emotions)
sentence.list=rbind(sentence.list,
cbind(speech.list[i,-ncol(speech.list)],
sentences=as.character(sentences),
word.count,
emotions,
sent.id=1:length(sentences)
)
)
}
}
Some non-sentences exist in raw data due to erroneous extra end-of sentence marks.
sentence.list=
sentence.list%>%
filter(!is.na(word.count))
For simpler visualization, we chose a subset of better known presidents or presidential candidates on which to focus our analysis.
sel.comparison=c("DonaldJTrump","JohnMcCain", "GeorgeBush", "MittRomney", "GeorgeWBush",
"RonaldReagan","AlbertGore,Jr", "HillaryClinton","JohnFKerry",
"WilliamJClinton","HarrySTruman", "BarackObama", "LyndonBJohnson",
"GeraldRFord", "JimmyCarter", "DwightDEisenhower", "FranklinDRoosevelt",
"HerbertHoover","JohnFKennedy","RichardNixon","WoodrowWilson",
"AbrahamLincoln", "TheodoreRoosevelt", "JamesGarfield",
"JohnQuincyAdams", "UlyssesSGrant", "ThomasJefferson",
"GeorgeWashington", "WilliamHowardTaft", "AndrewJackson",
"WilliamHenryHarrison", "JohnAdams")
First, we look at nomination acceptance speeches at major party’s national conventions. For relevant to Trump’s speeches, we limit our attention to speeches for the first terms of former U.S. presidents. We noticed that a number of presidents have very short sentences in their nomination acceptance speeches.
par(mar=c(4, 11, 2, 2))
#sel.comparison=levels(sentence.list$FileOrdered)
sentence.list.sel=filter(sentence.list,
type=="nomin", Term==1, File%in%sel.comparison)
sentence.list.sel$File=factor(sentence.list.sel$File)
sentence.list.sel$FileOrdered=reorder(sentence.list.sel$File,
sentence.list.sel$word.count,
mean,
order=T)
beeswarm(word.count~FileOrdered,
data=sentence.list.sel,
horizontal = TRUE,
pch=16, col=alpha(brewer.pal(9, "Set1"), 0.6),
cex=0.55, cex.axis=0.8, cex.lab=0.8,
spacing=5/nlevels(sentence.list.sel$FileOrdered),
las=2, xlab="Number of words in a sentence.", ylab="",
main="Nomination speeches")
par(mar=c(4, 11, 2, 2))
#sel.comparison=levels(sentence.list$FileOrdered)
sentence.list.sel=filter(sentence.list,
type=="nomin", Term==2, File%in%sel.comparison)
sentence.list.sel$File=factor(sentence.list.sel$File)
sentence.list.sel$FileOrdered=reorder(sentence.list.sel$File,
sentence.list.sel$word.count,
mean,
order=T)
beeswarm(word.count~FileOrdered,
data=sentence.list.sel,
horizontal = TRUE,
pch=16, col=alpha(brewer.pal(9, "Set1"), 0.6),
cex=0.55, cex.axis=0.8, cex.lab=0.8,
spacing=1.2/nlevels(sentence.list.sel$FileOrdered),
las=2, xlab="Number of words in a sentence.", ylab="",
main="Nomination speeches, 2nd term")
What are these short sentences?
sentence.list%>%
filter(File=="DonaldJTrump",
type=="nomin",
word.count<=3)%>%
select(sentences)%>%sample_n(10)
sentence.list%>%
filter(File=="AlbertGore,Jr",
type=="nomin",
word.count<=3)%>%
select(sentences)%>%sample_n(10)
sentence.list%>%
filter(File=="HillaryClinton",
type=="nomin",
word.count<=3)%>%
select(sentences)
sentence.list%>%
filter(File=="WilliamJClinton",
type=="nomin", Term==1,
word.count<=3)%>%
select(sentences)
We notice that the sentences in inaugural speeches are longer than those in nomination acceptance speeches.
sentence.list.sel=sentence.list%>%filter(type=="inaug", File%in%sel.comparison, Term==1)
sentence.list.sel$File=factor(sentence.list.sel$File)
sentence.list.sel$FileOrdered=reorder(sentence.list.sel$File,
sentence.list.sel$word.count,
mean,
order=T)
par(mar=c(4, 11, 2, 2))
beeswarm(word.count~FileOrdered,
data=sentence.list.sel,
horizontal = TRUE,
pch=16, col=alpha(brewer.pal(9, "Set1"), 0.6),
cex=0.55, cex.axis=0.8, cex.lab=0.8,
spacing=5/nlevels(sentence.list.sel$FileOrdered),
las=2, ylab="", xlab="Number of words in a sentence.",
main="Inaugural Speeches")
Short sentences in inaugural speeches.
sentence.list%>%
filter(File=="BarackObama",
type=="inaug",
word.count<=3)%>%
select(sentences)
How our presidents (or candidates) alternate between long and short sentences and how they shift between different sentiments in their speeches. It is interesting to note that some presidential candidates’ speech are more colorful than others. Here we used the same color theme as in the movie “Inside Out.”
image
par(mfrow=c(4,1), mar=c(1,0,2,0), bty="n", xaxt="n", yaxt="n", font.main=1)
f.plotsent.len(In.list=sentence.list, InFile="HillaryClinton",
InType="nomin", InTerm=1, President="Hillary Clinton")
f.plotsent.len(In.list=sentence.list, InFile="DonaldJTrump",
InType="nomin", InTerm=1, President="Donald Trump")
f.plotsent.len(In.list=sentence.list, InFile="BarackObama",
InType="nomin", InTerm=1, President="Barack Obama")
f.plotsent.len(In.list=sentence.list, InFile="BarackObama",
InType="nomin", InTerm=2, President="Barack Obama")
#f.plotsent.len(In.list=sentence.list, InFile="GeorgeWBush",
# InType="nomin", InTerm=1, President="George W. Bush")
print("Hillary Clinton")
[1] "Hillary Clinton"
speech.df=tbl_df(sentence.list)%>%
filter(File=="HillaryClinton", type=="nomin", word.count>=4)%>%
select(sentences, anger:trust)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "Some of you are frustrated, even furious."
[2] "It's a big deal."
[3] "Powerful forces are threatening to pull us apart."
[4] "Powerful forces are threatening to pull us apart."
[5] "It's a big deal."
[6] "My mother, Dorothy, was abandoned by her parents as a young girl."
[7] "It's a big deal."
[8] "Bonds of trust and respect are fraying."
print("Barack Obama")
[1] "Barack Obama"
speech.df=tbl_df(sentence.list)%>%
filter(File=="BarackObama", type=="nomin", Term==1, word.count>=5)%>%
select(sentences, anger:trust)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "They could've heard words of anger and discord."
[2] "And that's to be expected."
[3] "It's not because John McCain doesn't care."
[4] "Now let there be no doubt."
[5] "That promise is our greatest inheritance."
[6] "Now let there be no doubt."
[7] "That's not the judgment we need."
[8] "That promise is our greatest inheritance."
print("George W Bush")
[1] "George W Bush"
speech.df=tbl_df(sentence.list)%>%
filter(File=="GeorgeWBush", type=="nomin", Term==1, word.count>=4)%>%
select(sentences, anger:trust)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "On the other side of that wall are poverty and prison, addiction and despair."
[2] "We're proud of you."
[3] "But they've got it backwards."
[4] "And at the earliest possible date, my administration will deploy missile defenses to guard against attack and blackmail."
[5] "I appreciate his friendship."
[6] "On the other side of that wall are poverty and prison, addiction and despair."
[7] "They had their chance."
[8] "Corporations are responsible to treat their workers fairly and to leave the air and waters clean."
print("Donald Trump")
[1] "Donald Trump"
speech.df=tbl_df(sentence.list)%>%
filter(File=="DonaldJTrump", type=="nomin", Term==1, word.count>=5)%>%
select(sentences, anger:trust)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "Once again, France is the victim of brutal Islamic terrorism."
[2] "God bless you, and good night!"
[3] "I have visited the laid-off factory workers, and the communities crushed by our horrible and unfair trade deals."
[4] "Once again, France is the victim of brutal Islamic terrorism."
[5] "God bless you, and good night!"
[6] "Three were killed, and three were very very badly injured."
[7] "God bless you, and good night!"
[8] "God bless you, and good night!"
heatmap.2(cor(sentence.list%>%filter(type=="inaug")%>%select(anger:trust)),
scale = "none",
col = bluered(100), , margin=c(6, 6), key=F,
trace = "none", density.info = "none")
par(mar=c(4, 6, 2, 1))
emo.means=colMeans(select(sentence.list, anger:trust)>0.01)
col.use=c("red2", "darkgoldenrod1",
"chartreuse3", "blueviolet",
"darkgoldenrod2", "dodgerblue3",
"darkgoldenrod1", "darkgoldenrod1")
barplot(emo.means[order(emo.means)], las=2, col=col.use[order(emo.means)], horiz=T, main="Inaugural Speeches")
presid.summary=tbl_df(sentence.list)%>%
filter(type=="nomin", File%in%sel.comparison)%>%
#group_by(paste0(type, File))%>%
group_by(File)%>%
summarise(
anger=mean(anger),
anticipation=mean(anticipation),
disgust=mean(disgust),
fear=mean(fear),
joy=mean(joy),
sadness=mean(sadness),
surprise=mean(surprise),
trust=mean(trust)
#negative=mean(negative),
#positive=mean(positive)
)
presid.summary=as.data.frame(presid.summary)
rownames(presid.summary)=as.character((presid.summary[,1]))
km.res=kmeans(presid.summary[,-1], iter.max=200,
5)
fviz_cluster(km.res,
stand=F, repel= TRUE,
data = presid.summary[,-1], xlab="", xaxt="n",
show.clust.cent=FALSE)
For topic modeling, we prepare a corpus of sentence snipets as follows. For each speech, we start with sentences and prepare a snipet with a given sentence with the flanking sentences.
corpus.list=sentence.list[2:(nrow(sentence.list)-1), ]
sentence.pre=sentence.list$sentences[1:(nrow(sentence.list)-2)]
sentence.post=sentence.list$sentences[3:(nrow(sentence.list)-1)]
corpus.list$snipets=paste(sentence.pre, corpus.list$sentences, sentence.post, sep=" ")
rm.rows=(1:nrow(corpus.list))[corpus.list$sent.id==1]
rm.rows=c(rm.rows, rm.rows-1)
corpus.list=corpus.list[-rm.rows, ]
docs <- Corpus(VectorSource(corpus.list$snipets))
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
It is in this spirit that we demand adequate provision for national defense, and we condemn the inexcusable neglect that has been shown in this matter of first national importance. We must have the strength which self-respect demands, the strength of an efficient nation ready for every emergency. Our preparation must be industrial and economical as well.
Adapted from https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/.
#remove potentially problematic symbols
docs <-tm_map(docs,content_transformer(tolower))
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
it covers by its position in the gulf the mississippi and other great waters within our extended limits, and thereby enables the united states to afford complete protection to the vast and very valuable productions of our whole western country, which find a market through those streams. by a treaty with the british government, bearing date on the 20th of october, 1818, the convention regulating the commerce between the united states and great britain, concluded on the 3d of july, 1815, which was about expiring, was revived and continued for the term of ten years from the time of its expiration. by that treaty, also, the differences which had arisen under the treaty of ghent respecting the right claimed by the united states for their citizens to take and cure fish on the coast of his britannic majesty's dominions in america, with other differences on important interests, were adjusted to the satisfaction of both parties.
#remove punctuation
docs <- tm_map(docs, removePunctuation)
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
i propose to you my friends and through you that government of all kinds big and little be made solvent and that the example be set by the president of the united states and his cabinet and talking about setting a definite example i congratulate this convention for having had the courage fearlessly to write into its declaration of principles what an overwhelming majority here assembled really thinks about the 18th amendment this convention wants repeal
#Strip digits
docs <- tm_map(docs, removeNumbers)
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
that platform will be the heart of the message i will take to the country after january th it will be the cornerstone of our republican administration fortunately we are a united party
#remove stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
will say nothing campaign might destroy chance war ended people choose november choice will clear
#remove whitespace
docs <- tm_map(docs, stripWhitespace)
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
happens democrats must will rightly morally enable states protect importation intoxicating liquor importation may violate state laws must rightly morally prevent return saloon go back dry subject finance ties togetherthe th amendment something finance tooin comprehensive planning reconstruction great credit groups including government credit list important place prize statement principle platform adopted calling letting light day issues securities foreign domestic offered sale investing public
#Stem document
docs <- tm_map(docs,stemDocument)
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
time avoid mistak help found sustain unit nation weld allianc includ greater part free world
Gengerate document-term matrices.
dtm <- DocumentTermMatrix(docs)
#convert rownames to filenames#convert rownames to filenames
rownames(dtm) <- paste(corpus.list$type, corpus.list$File,
corpus.list$Term, corpus.list$sent.id, sep="_")
rowTotals <- apply(dtm , 1, sum) #Find the sum of words in each Document
dtm <- dtm[rowTotals> 0, ]
corpus.list=corpus.list[rowTotals>0, ]
Run LDA
#Set parameters for Gibbs sampling
burnin <- 4000
iter <- 2000
thin <- 500
seed <-list(2003,5,63,100001,765)
nstart <- 5
best <- TRUE
#Number of topics
k <- 15
#Run LDA using Gibbs sampling
ldaOut <-LDA(dtm, k, method="Gibbs", control=list(nstart=nstart,
seed = seed, best=best,
burnin = burnin, iter = iter,
thin=thin))
terms.beta=ldaOut@beta
terms.beta=scale(terms.beta)
topics.terms=NULL
for(i in 1:k){
topics.terms=rbind(topics.terms, ldaOut@terms[order(terms.beta[i,], decreasing = TRUE)[1:7]])
}
topics.terms
ldaOut.terms
Based on the most popular terms and the most salient terms for each topic, we assign a hashtag to each topic. This part require manual setup as the topics are likely to change.
topics.hash=c("Economy", "America", "Defense", "Belief", "Election", "Patriotism", "Unity", "Government", "Reform", "Temporal", "WorkingFamilies", "Freedom", "Equality", "Misc", "Legislation")
corpus.list$ldatopic=as.vector(ldaOut.topics)
Error in as.vector(ldaOut.topics) : object 'ldaOut.topics' not found
par(mar=c(1,1,1,1))
topic.summary=tbl_df(corpus.list.df)%>%
filter(type%in%c("nomin", "inaug"), File%in%sel.comparison)%>%
select(File, Economy:Legislation)%>%
group_by(File)%>%
summarise_each(funs(mean))
Error in as_data_frame(data) : object 'corpus.list.df' not found
# [1] "Economy" "America" "Defense" "Belief"
# [5] "Election" "Patriotism" "Unity" "Government"
# [9] "Reform" "Temporal" "WorkingFamilies" "Freedom"
# [13] "Equality" "Misc" "Legislation"
par(mfrow=c(5, 1), mar=c(1,1,2,0), bty="n", xaxt="n", yaxt="n")
topic.plot=c(1, 13, 14, 15, 8, 9, 12)
print(topics.hash[topic.plot])
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeBush", type=="nomin",Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George Bush, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="WilliamJClinton", type=="nomin", Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Bill Clinton, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeWBush", type=="nomin", Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George W Bush, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="BarackObama", type=="nomin", Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Barack Obama, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="DonaldJTrump", type=="nomin")%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Donald Trump, Nomination")
# [1] "Economy" "America" "Defense" "Belief"
# [5] "Election" "Patriotism" "Unity" "Government"
# [9] "Reform" "Temporal" "WorkingFamilies" "Freedom"
# [13] "Equality" "Misc" "Legislation"
par(mfrow=c(5, 1), mar=c(1,1,2,0), bty="n", xaxt="n", yaxt="n")
topic.plot=c(1, 13, 14, 15, 8, 9, 12)
print(topics.hash[topic.plot])
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeBush", type=="inaug", Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George Bush, inaugural Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="WilliamJClinton", type=="inaug", Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="William J Clinton, inaugural Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeWBush", type=="inaug", Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George W. Bush, inaugural Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="BarackObama", type=="inaug", Term==1)%>%select(sent.id, Economy:Legislation)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Barack Obama, inaugural Speeches")
topic.plot=c(1, 13, 14, 15, 8, 9, 12)
print(topics.hash[topic.plot])
Error in print(topics.hash[topic.plot]) : object 'topics.hash' not found
speech.df=tbl_df(corpus.list.df)%>%filter(type=="nomin", word.count<20)%>%select(sentences, Economy:Legislation)
as.character(speech.df$sentences[apply(as.data.frame(speech.df[,-1]), 2, which.max)])
names(speech.df)[-1]
presid.summary=tbl_df(corpus.list.df)%>%
filter(type=="inaug", File%in%sel.comparison)%>%
select(File, Economy:Legislation)%>%
group_by(File)%>%
summarise_each(funs(mean))
presid.summary=as.data.frame(presid.summary)
rownames(presid.summary)=as.character((presid.summary[,1]))
km.res=kmeans(scale(presid.summary[,-1]), iter.max=200,
5)
fviz_cluster(km.res,
stand=T, repel= TRUE,
data = presid.summary[,-1],
show.clust.cent=FALSE)